1 Introduction

1.1 Load the data

# Load the raw World Happiness data, one CSV file per year (2015-2022).
# The directory is defined once so the script can be pointed at another
# location by editing a single line.
data_dir <- "/Users/shrijamittal/Downloads/happiness"

# Read the CSV for one year; files are named "<year>.csv" inside `data_dir`.
read_happiness_year <- function(year) {
  read.csv(file.path(data_dir, paste0(year, ".csv")))
}

hi_2015 <- read_happiness_year(2015)
hi_2016 <- read_happiness_year(2016)
hi_2017 <- read_happiness_year(2017)
hi_2018 <- read_happiness_year(2018)
hi_2019 <- read_happiness_year(2019)
hi_2020 <- read_happiness_year(2020)
hi_2021 <- read_happiness_year(2021)
hi_2022 <- read_happiness_year(2022)

1.2 Clean the data

# The column layout differs from year to year, so each year is first aligned to a common structure before the years are merged

# 2015: add the year, then keep only the shared set of columns under their
# common names and in the common order. Columns that are not listed
# (Standard.Error, Region) are dropped by the select().
hi_2015_c <- hi_2015 %>%
  mutate(Year = 2015) %>%
  select(
    Year,
    Rank = Happiness.Rank,
    Country,
    Score = Happiness.Score,
    GDP_per_cap = Economy..GDP.per.Capita.,
    SocialSupport = Family, # "Family" is called Social Support in later years
    Healthy_Life_Exp = Health..Life.Expectancy.,
    Freedom,
    Corruption = Trust..Government.Corruption.,
    Generosity,
    Dystopia = Dystopia.Residual
  )
  

# 2016: add the year, then keep only the shared set of columns under their
# common names and in the common order. Columns that are not listed (the
# lower/upper confidence interval and Region) are dropped by the select().
hi_2016_c <- hi_2016 %>%
  mutate(Year = 2016) %>%
  select(
    Year,
    Rank = Happiness.Rank,
    Country,
    Score = Happiness.Score,
    GDP_per_cap = Economy..GDP.per.Capita.,
    SocialSupport = Family, # "Family" is called Social Support in later years
    Healthy_Life_Exp = Health..Life.Expectancy.,
    Freedom,
    Corruption = Trust..Government.Corruption.,
    Generosity,
    Dystopia = Dystopia.Residual
  )
  

# 2017: add the year, then keep only the shared set of columns under their
# common names and in the common order. The whisker (high/low) columns are
# not listed and are therefore dropped; Family is kept and renamed.
hi_2017_c <- hi_2017 %>%
  mutate(Year = 2017) %>%
  select(
    Year,
    Rank = Happiness.Rank,
    Country,
    Score = Happiness.Score,
    GDP_per_cap = Economy..GDP.per.Capita.,
    SocialSupport = Family, # "Family" is called Social Support in later years
    Healthy_Life_Exp = Health..Life.Expectancy.,
    Freedom,
    Corruption = Trust..Government.Corruption.,
    Generosity,
    Dystopia = Dystopia.Residual
  )
  

# 2018: align the column names with the other years and derive the Dystopia
# residual, which this year's file does not provide.
hi_2018_c <- hi_2018 %>%

  # Rename the headers to have the same name for each year
  rename(
    Rank = Overall.rank,
    Country = Country.or.region,
    GDP_per_cap = GDP.per.capita,
    SocialSupport = Social.support,
    Healthy_Life_Exp = Healthy.life.expectancy,
    Freedom = Freedom.to.make.life.choices,
    Corruption = Perceptions.of.corruption
  ) %>%

  mutate(
    # Dystopia = Score minus the sum of the six explanatory factors. The
    # factors are selected by name (not by column position) so the result
    # stays correct if the column order of the input file changes.
    Dystopia = Score - rowSums(select(
      ., GDP_per_cap, SocialSupport, Healthy_Life_Exp,
      Freedom, Generosity, Corruption
    )),

    # Create year col.
    Year = 2018
  ) %>%

  # Change the position to align each year
  select(
    Year, Rank, Country, Score, GDP_per_cap, SocialSupport,
    Healthy_Life_Exp, Freedom, Corruption, Generosity, Dystopia
  )

  
# 2019: align the column names with the other years and derive the Dystopia
# residual, which this year's file does not provide.
hi_2019_c <- hi_2019 %>%

  # Rename the headers to have the same name for each year
  rename(
    Rank = Overall.rank,
    Country = Country.or.region,
    GDP_per_cap = GDP.per.capita,
    SocialSupport = Social.support,
    Healthy_Life_Exp = Healthy.life.expectancy,
    Freedom = Freedom.to.make.life.choices,
    Corruption = Perceptions.of.corruption
  ) %>%

  mutate(
    # Dystopia = Score minus the sum of the six explanatory factors. The
    # factors are selected by name (not by column position) so the result
    # stays correct if the column order of the input file changes.
    Dystopia = Score - rowSums(select(
      ., GDP_per_cap, SocialSupport, Healthy_Life_Exp,
      Freedom, Generosity, Corruption
    )),

    # Create year col.
    Year = 2019
  ) %>%

  # Change the position to align each year
  select(
    Year, Rank, Country, Score, GDP_per_cap, SocialSupport,
    Healthy_Life_Exp, Freedom, Corruption, Generosity, Dystopia
  )



# 2020: this file has no rank column, so rows are sorted by ladder score
# (highest first) and numbered to create one. Only the "Explained by"
# contributions and the Dystopia residual are kept, under the shared names;
# every column not listed in the select() is dropped.
hi_2020_c <- hi_2020 %>%
  arrange(desc(Ladder.score)) %>%
  mutate(
    Year = 2020,
    Rank = row_number()
  ) %>%
  select(
    Year,
    Rank,
    Country = Country.name,
    Score = Ladder.score,
    GDP_per_cap = Explained.by..Log.GDP.per.capita,
    SocialSupport = Explained.by..Social.support,
    Healthy_Life_Exp = Explained.by..Healthy.life.expectancy,
    Freedom = Explained.by..Freedom.to.make.life.choices,
    Corruption = Explained.by..Perceptions.of.corruption,
    Generosity = Explained.by..Generosity,
    Dystopia = Dystopia...residual
  )


# 2021: this file has no rank column, so rows are sorted by ladder score
# (highest first) and numbered to create one. Only the "Explained by"
# contributions and the Dystopia residual are kept, under the shared names;
# every column not listed in the select() is dropped.
hi_2021_c <- hi_2021 %>%
  arrange(desc(Ladder.score)) %>%
  mutate(
    Year = 2021,
    Rank = row_number()
  ) %>%
  select(
    Year,
    Rank,
    Country = Country.name,
    Score = Ladder.score,
    GDP_per_cap = Explained.by..Log.GDP.per.capita,
    SocialSupport = Explained.by..Social.support,
    Healthy_Life_Exp = Explained.by..Healthy.life.expectancy,
    Freedom = Explained.by..Freedom.to.make.life.choices,
    Corruption = Explained.by..Perceptions.of.corruption,
    Generosity = Explained.by..Generosity,
    Dystopia = Dystopia...residual
  )


# 2022: add the year, then keep only the shared set of columns under their
# common names and in the common order. The whisker (high/low) columns are
# not listed and are therefore dropped.
hi_2022_c <- hi_2022 %>%
  mutate(Year = 2022) %>%
  select(
    Year,
    Rank = RANK,
    Country,
    Score = Happiness.score,
    GDP_per_cap = Explained.by..GDP.per.capita,
    SocialSupport = Explained.by..Social.support,
    Healthy_Life_Exp = Explained.by..Healthy.life.expectancy,
    Freedom = Explained.by..Freedom.to.make.life.choices,
    Corruption = Explained.by..Perceptions.of.corruption,
    Generosity = Explained.by..Generosity,
    Dystopia = Dystopia..1.83....residual
  )
  

# Stack the aligned yearly datasets into one long table
happiness <- rbind(
  hi_2015_c, hi_2016_c, hi_2017_c, hi_2018_c,
  hi_2019_c, hi_2020_c, hi_2021_c, hi_2022_c
) %>%

  # Harmonise country names so the same country matches across years
  mutate(
    # Strip a trailing "*" marker (e.g. "Guatemala*", "Madagascar*") from any
    # country name instead of listing the affected countries one by one
    Country = sub("\\*$", "", Country),
    Country = ifelse(Country == "Trinidad & Tobago", "Trinidad and Tobago", Country)
  )


# Build a country -> region lookup from the 2020 file, shortening the long
# CIS region name
regions <- hi_2020 %>%
  select(Country = Country.name, Region = Regional.indicator) %>%
  mutate(
    Region = if_else(
      Region == "Commonwealth of Independent States",
      "Independent States (CIS)",
      Region
    )
  )

# Attach the region to every row of the combined dataset; countries that are
# absent from the lookup get NA
happiness <- left_join(happiness, regions, by = "Country")

# Countries that did not receive a region from the join above
no_region <- happiness %>%
  filter(is.na(Region)) %>%
  distinct(Country, Region)

# Country -> region lookup taken from the 2015 file, used as a fallback
regions15 <- hi_2015 %>%
  select(Country, Region)

# Fill the gaps: first try the 2015 lookup, then assign a region by hand to
# the few countries that are listed explicitly
no_region2 <- no_region %>%
  select(Country) %>%
  left_join(regions15, by = "Country") %>%
  mutate(
    Region = case_when(
      Country %in% c("Puerto Rico", "Belize") ~ "Latin America and Caribbean",
      Country == "North Macedonia" ~ "Central and Eastern Europe",
      Country == "Eswatini" ~ "Sub-Saharan Africa",
      TRUE ~ Region
    )
  )

# Merge the fallback regions into the main dataset and tidy the result
happiness_clean <- happiness %>%

  # Bring in the fallback region as Region_nr2 and prefer it where present
  left_join(no_region2, by = "Country", suffix = c("", "_nr2")) %>%
  mutate(Region = coalesce(Region_nr2, Region)) %>%
  select(-Region_nr2) %>%

  # Put Country and Region first, followed by Year and Rank
  select(Country, Region, Year, Rank, everything()) %>%

  # Merge differently spelled region names to reduce the number of regions
  mutate(
    Region = case_when(
      Region == "Southern Asia" ~ "South Asia",
      Region == "Middle East and Northern Africa" ~ "Middle East and North Africa",
      TRUE ~ Region
    )
  )

# Summarise the cleaned data to confirm it has the intended structure
skim(happiness_clean)
Data summary
Name happiness_clean
Number of rows 1230
Number of columns 12
_______________________
Column type frequency:
character 2
numeric 10
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Country 0 1 4 24 0 166 0
Region 0 1 9 28 0 10 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Year 0 1 2018.45 2.28 2015.00 2016.00 2018.00 2020.00 2022.00 ▇▃▇▃▇
Rank 0 1 77.42 44.49 1.00 39.00 77.00 116.00 158.00 ▇▇▇▇▇
Score 0 1 5.43 1.12 2.40 4.58 5.41 6.23 7.84 ▁▅▇▇▃
GDP_per_cap 0 1 0.98 0.44 0.00 0.67 1.01 1.30 2.21 ▃▅▇▃▁
SocialSupport 0 1 1.03 0.33 0.00 0.82 1.07 1.27 1.64 ▁▂▆▇▅
Healthy_Life_Exp 0 1 0.61 0.24 0.00 0.44 0.64 0.79 1.14 ▂▅▇▇▂
Freedom 0 1 0.44 0.15 0.00 0.34 0.46 0.56 0.74 ▁▃▆▇▃
Corruption 0 1 0.13 0.11 0.00 0.06 0.10 0.16 0.59 ▇▃▁▁▁
Generosity 0 1 0.20 0.12 0.00 0.12 0.19 0.26 0.84 ▇▇▂▁▁
Dystopia 0 1 2.04 0.57 0.18 1.69 2.06 2.42 3.84 ▁▃▇▃▁
# The data now seems to be aligned and cleaned. We can now start making visualizations (YEESS!!)

# write.csv(happiness_clean, "happiness_clean.csv", row.names = FALSE)

2 Visualizations

# Make theme_minimal() with the "Lato" font family the default ggplot2 theme
# for the plots that follow.
# NOTE(review): presumably requires the Lato font to be available to the
# graphics device - confirm.
theme_set(theme_minimal(base_family = "Lato"))

2.1 Change over time

# Text of the callout annotation drawn on the chart
text_label1 <- "Central and Eastern Europe increased\nmean happiness by 13%"

# Mean happiness score per region and year. Computed once and reused for both
# `last_points` and the chart (previously the same aggregation ran twice).
region_year_means <- happiness_clean %>%
  group_by(Region, Year) %>%
  summarise(mean = mean(Score), .groups = "drop")

# Regional means in the final year (2022)
last_points <- region_year_means %>%
  filter(Year == 2022)

# Line chart: mean happiness score per region over time
region_year_means %>%

  # Label text for the end of each line; only the 2022 point carries a label
  mutate(name_lab = if_else(Year == 2022, Region, NA_character_)) %>%

  ggplot(aes(x = Year, y = mean, color = Region)) +
  geom_line(size = .8) +

  theme_minimal() +
  theme(
    # Longer axis ticks
    axis.ticks.length.x = unit(1.3, "lines"),
    axis.ticks.length.y = unit(.7, "lines"),

    # Remove the minor grid lines on the x axis
    panel.grid.minor.x = element_blank(),

    # Margins (top, right, bottom, left); extra room for the line labels
    plot.margin = margin(10, 40, 10, 40),

    # Title appearance
    plot.title = element_text(
      color = "grey10",
      size = 20,
      face = "bold",
      margin = margin(t = 5)
    ),

    # Subtitle appearance (markdown, so <br> breaks the line)
    plot.subtitle = element_markdown(
      color = "grey30",
      size = 14,
      lineheight = 1,
      margin = margin(t = 5, b = 10)
    ),

    # Align title and caption with the whole plot, not the panel
    plot.title.position = "plot",
    plot.caption.position = "plot",
    plot.caption = element_text(
      color = "grey30",
      size = 9,
      lineheight = 1.2,
      hjust = 0,
      margin = margin(t = 10)
    ),

    # The lines are labelled directly, so no legend is needed
    legend.position = "none",

    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12)
  ) +

  # Titles and axis labels
  labs(
    title = "Western Europe and North America and ANZ remains the leaders in mean happiness",
    subtitle = "Central and Eastern Europe has had the highest increase in mean happiness in the period with a 13% increase.<br>Interestingly, Covid-19 seems to have had no meaningful impact on mean happiness across regions, apart from South Asia,<br>which dropped by 4% from 2019 to 2022",
    caption = "Source: World Happiness Index, 2015-2022",
    color = "Region",
    x = NULL,
    y = "Mean Happiness Score"
  ) +

  # Region names to the right of the lines, connected by dotted segments
  geom_text_repel(
    aes(color = Region, label = name_lab),
    family = "Lato",
    fontface = "bold",
    size = 4.5,
    direction = "y",
    xlim = c(2022.2, NA),
    hjust = 0,
    segment.size = .7,
    segment.alpha = .5,
    segment.linetype = "dotted",
    box.padding = .4,
    segment.curvature = -0.1,
    segment.ncp = 3,
    segment.angle = 20
  ) +

  # Coordinate system and scales; clipping is off so labels can extend
  # beyond the panel
  coord_cartesian(
    clip = "off",
    ylim = c(3.5, 7.5)
  ) +
  scale_x_continuous(
    expand = c(0, 0),
    limits = c(2015, 2024.3),
    breaks = seq(2015, 2022, by = 2)
  ) +
  scale_y_continuous(expand = c(0, 0)) +
  scale_color_carto_d(name = "Region", palette = "Earth", type = "diverging") +

  # Curved arrow pointing from the callout towards the highlighted line
  geom_curve(
    data = data.frame(x = 2020.8, y = 6.25, xend = 2021.8, yend = 6.13),
    mapping = aes(x = x, y = y, xend = xend, yend = yend),
    colour = "tomato",
    size = 0.5,
    curvature = -0.4,
    arrow = arrow(length = unit(2, "mm"), type = "closed"),
    inherit.aes = FALSE
  ) +

  # Callout text; the label is mapped from the layer's own data column
  geom_text(
    data = data.frame(x = 2020, y = 6.4, label = text_label1),
    aes(x = x, y = y, label = label),
    colour = "black",
    family = "Lato",
    hjust = 0.5,
    lineheight = .8,
    inherit.aes = FALSE,
    size = 4
  )

2.2 Top risers and fallers

# Top 20 nations with the highest growth in happiness score and the highest decline/lowest growth

# Change in happiness score per country between 2015 and 2022.
# Only countries present in both years are kept, and the difference is taken
# explicitly as 2022 minus 2015, so the result does not depend on row order
# and every group yields exactly one row. The result is returned ungrouped.
score_change <- happiness_clean %>%
  filter(Year %in% c(2015, 2022)) %>%
  group_by(Country, Region) %>%
  filter(n_distinct(Year) == 2) %>%
  summarise(
    Change = Score[Year == 2022] - Score[Year == 2015],
    .groups = "drop"
  )

# The 20 countries with the largest increase in happiness score
top_20 <- score_change %>%
  ungroup() %>%
  arrange(desc(Change)) %>%
  head(20)

# Regions to highlight: Sub-Saharan Africa and Central and Eastern Europe,
# as far as they occur among the top 20
highlights_top <- top_20 %>%
  filter(Region %in% c("Sub-Saharan Africa", "Central and Eastern Europe")) %>%
  pull(Region)

# Fill colours, one per colour group of the chart below
top_colors <- c('#9C6F32', 'grey60', '#6597A3')

# Horizontal bar chart of the top 20 risers
top_20_plot <- top_20 %>%

  # Colour group: the highlighted region, or "Other Regions"
  mutate(group = if_else(Region %in% highlights_top, Region, "Other Regions")) %>%

  ggplot(aes(x = Change, y = reorder(Country, Change), fill = group)) +

  # Light vertical reference lines, drawn underneath the bars
  geom_vline(
    xintercept = c(0.5, 1, 1.5),
    color = "grey91",
    size = .6
  ) +

  geom_col() +

  # Country names written inside the bars, starting at the left edge
  geom_text(
    aes(label = Country),
    position = position_stack(vjust = 0.01),
    hjust = 0,
    color = "white",
    fontface = 'bold'
  ) +

  # Apply the chosen fill colours
  scale_fill_manual(values = top_colors) +

  labs(
    title = "C&E Europe and Sub-Saharan Africa make up 75% of top 20 risers",
    subtitle = "7 out of 17 countries in Central and Eastern Europe are among the top 20 risers,contributing to making<br>C&E Europe the highest growing region. Interestingly, Sub-Saharan Africa countries account for 40% of the<br>top 20 risers, however the region saw little growth on an mean level, indicating polarization in the region",
    caption = "Source: World Happiness Index, 2015-2022",
    y = NULL,
    x = "Change in Happiness Score from 2015 to 2022",
    fill = "Region"
  ) +

  theme_minimal() +
  theme(
    # Countries are labelled inside the bars, so hide the y axis text and grid
    axis.text.y = element_blank(),
    panel.grid = element_blank(),

    # Margins (top, right, bottom, left)
    plot.margin = margin(10, 40, 10, 40),

    # Place the legend inside the panel, at the right edge
    legend.position = c(1, .50),
    legend.justification = c("right", "top"),

    # Title appearance
    plot.title = element_text(
      color = "grey10",
      size = 16,
      face = "bold",
      margin = margin(t = 5)
    ),

    # Subtitle appearance (markdown, so <br> breaks the line)
    plot.subtitle = element_markdown(
      color = "grey30",
      size = 11,
      lineheight = 1,
      margin = margin(t = 5, b = 10)
    ),

    # Align title and caption with the whole plot, not the panel
    plot.title.position = "plot",
    plot.caption.position = "plot",
    plot.caption = element_text(
      color = "grey30",
      size = 9,
      lineheight = 1.2,
      hjust = 0,
      margin = margin(t = 10)
    )
  )


# The 20 countries with the largest decline (or smallest growth) in score
bottom_20 <- score_change %>%
  ungroup() %>%
  arrange(Change) %>%
  head(20)

# Regions to highlight: Sub-Saharan Africa and Latin America and Caribbean,
# as far as they occur among the bottom 20
highlights_bottom <- bottom_20 %>%
  filter(Region %in% c("Sub-Saharan Africa", "Latin America and Caribbean")) %>%
  pull(Region)

# Fill colours, one per colour group of the chart below
bottom_colors <- c('#CFBC8B', 'grey60', '#6597A3')

# Horizontal bar chart of the top 20 fallers
bottom_20_plot <- bottom_20 %>%

  # Colour group: the highlighted region, or "Other Regions"
  mutate(group = if_else(Region %in% highlights_bottom, Region, "Other Regions")) %>%

  ggplot(aes(x = Change, y = reorder(Country, -Change), fill = group)) +

  # Light vertical reference lines, drawn underneath the bars
  geom_vline(
    xintercept = c(-0.5, -1, -1.5, -2),
    color = "grey91",
    size = .6
  ) +

  geom_col() +

  # Country names written inside the bars, right-aligned near the bar end
  geom_text(
    aes(label = Country),
    position = position_stack(vjust = 0.98),
    hjust = 1,
    color = "white",
    fontface = 'bold'
  ) +

  # Apply the chosen fill colours
  scale_fill_manual(values = bottom_colors) +

  labs(
    title = "Interestingly, Sub-Saharan Africa also make up 35% of top 20 fallers",
    subtitle = "When exaiming the top 20 risers and fallers, the polarization in Sub-Saharan countries becomes evident,<br>with the region holding 40% of the top 20 risers and 35% of the top 20 fallers. Latin American & Caribbianas<br>well as South/Southeast Asia also include most of the top fallers in the period.",
    caption = "Source: World Happiness Index, 2015-2022",
    y = NULL,
    x = "Change in Happiness Score from 2015 to 2022",
    fill = "Region"
  ) +

  theme_minimal() +
  theme(
    # Countries are labelled inside the bars, so hide the y axis text and grid
    axis.text.y = element_blank(),
    panel.grid = element_blank(),

    # Margins (top, right, bottom, left)
    plot.margin = margin(10, 40, 10, 40),

    # Place the legend inside the panel, towards the left
    legend.position = c(.35, .50),
    legend.justification = c("right", "top"),

    # Title appearance
    plot.title = element_text(
      color = "grey10",
      size = 16,
      face = "bold",
      margin = margin(t = 5)
    ),

    # Subtitle appearance (markdown, so <br> breaks the line)
    plot.subtitle = element_markdown(
      color = "grey30",
      size = 11,
      lineheight = 1,
      margin = margin(t = 5, b = 10)
    ),

    # Align title and caption with the whole plot, not the panel
    plot.title.position = "plot",
    plot.caption.position = "plot",
    plot.caption = element_text(
      color = "grey30",
      size = 9,
      lineheight = 1.2,
      hjust = 0,
      margin = margin(t = 10)
    )
  )

# Print both charts
top_20_plot

bottom_20_plot

# Fill colours for the two years (2015, 2022) in each country chart
c1_colors <- c('grey60', '#6597A3')
c2_colors <- c('grey60', '#E1DBB0')

# Dodged bar chart comparing, for one country, the 2015 and 2022 value of
# each factor that makes up the happiness score.
#
# country_name: value of `Country` in `happiness_clean` to plot
# fill_colors:  two fill colours, applied to the years 2015 and 2022
# plot_title, plot_subtitle: chart title and (markdown) subtitle
#
# Returns the ggplot object.
plot_factor_change <- function(country_name, fill_colors, plot_title, plot_subtitle) {
  happiness_clean %>%

    # Keep the requested country and the two years being compared
    filter(Country == country_name, Year %in% c(2015, 2022)) %>%

    # Drop the columns that are not factors, then go to long format with one
    # row per year and factor (everything except Country and Year is a factor)
    select(-Region, -Rank, -Score) %>%
    pivot_longer(
      -c(Country, Year),
      names_to = "factors",
      values_to = "factor_score"
    ) %>%

    # Order the factors from largest to smallest value
    mutate(factors = fct_reorder(factors, factor_score, .desc = TRUE)) %>%

    ggplot(aes(x = factors, y = factor_score, fill = as.factor(Year))) +

    # One bar per year, side by side
    geom_bar(stat = "identity", position = "dodge") +

    # Apply the chosen fill colours
    scale_fill_manual(values = fill_colors) +

    theme(
      # Margins (top, right, bottom, left)
      plot.margin = margin(10, 40, 10, 40),

      # Clean up gridlines
      panel.grid.major.x = element_blank(),
      panel.grid.minor.y = element_blank(),

      # Title appearance
      plot.title = element_text(
        color = "grey10",
        size = 16,
        face = "bold",
        margin = margin(t = 5)
      ),

      # Subtitle appearance (markdown, so <br> breaks the line)
      plot.subtitle = element_markdown(
        color = "grey30",
        size = 11,
        lineheight = 1,
        margin = margin(t = 5, b = 10)
      ),

      # Align title and caption with the whole plot, not the panel
      plot.title.position = "plot",
      plot.caption.position = "plot",
      plot.caption = element_text(
        color = "grey30",
        size = 9,
        lineheight = 1.2,
        hjust = 0,
        margin = margin(t = 10)
      )
    ) +

    labs(
      title = plot_title,
      subtitle = plot_subtitle,
      caption = "Source: World Happiness Index, 2015-2022",
      y = "Score of for each Factor",
      x = "Factors making up the Happiness Score",
      fill = "Year"
    )
}

# Factor breakdown for Ivory Coast
ivory_coast <- plot_factor_change(
  "Ivory Coast",
  c1_colors,
  "Dystopia and GDP per Capita Drove the Increase in Ivory Coast's Happiness Score",
  "Ivory Coast was the country with the highest increase in Happiness Score from 2015 to 2022.<br>The increase was driven by large improvement in GDP per capita, healthy life expectancy, and dystopia."
)

# Factor breakdown for Lebanon
lebanon <- plot_factor_change(
  "Lebanon",
  c2_colors,
  "Despite an increase in GDP per Capita, Lebanon Suffered a Severe Drop in Happiness Score",
  "When exaiming Lebanon change in happiness score, the change was mostly driven by a substantialdrop in dystopia. <br>All other factors also decreased slightly, hence Lebanon suffered a drop in overall happiness score, <br>despite having an increase in GDP per Capita"
)

# Print both charts
ivory_coast

lebanon

## Graph: Violin

# y = GDP_per_cap (life expectancy was the alternative), x = region

# Outline colours for the violins: first for the other regions, second for
# the two highlighted regions
colours <- c("#515B64", "#e50000")

# Regions to highlight.
# NOTE(review): this reassigns `regions`, which earlier in the script holds
# the country-to-region lookup table - consider a different name.
regions <- c("Western Europe", "North America and ANZ")

happiness_clean %>%

  # Flag the highlighted regions so they get a separate outline colour
  mutate(top2 = Region %in% regions) %>%

  # Violin plot with a narrow box plot inside. The colour is mapped in the
  # violin layer only; the stray `color` argument to ggplot() was ignored
  # and has been removed.
  ggplot(aes(x = Region, y = GDP_per_cap)) +
  geom_violin(aes(colour = top2), trim = FALSE, size = 1) +
  geom_boxplot(width = 0.1) +

  labs(
    # str_wrap removes the hanging indent in the second line of title and subtitle
    title = str_wrap(
      "North America and Western Europe are ahead of others in GDP per capita, with lower change between countries in these regions compared to others",
      indent = 0
    ),
    subtitle = str_wrap("Economic strength is considered to be one of the biggest contributors to 
    happiness, which cements these countries' rankings on top of the happiness ranking charts", indent = 0),
    x = "Region",
    y = "GDP per capita"
  ) +

  scale_colour_manual(values = colours) +

  theme(
    # Angle and nudge the x axis text so labels neither overlap nor clip
    axis.text.x = element_text(size = 8, angle = 25, vjust = 0.6),
    legend.position = "none",
    # Same title styling as the previous graphs
    plot.title = element_text(
      color = "grey10",
      size = 16,
      face = "bold"
    )
  )

  #theme(axis.text.x = element_text(size=8, angle=25, vjust = 0.6))

2.3 Scatter correlation

library(ggpubr) #need this to ggarrange

# One colour per year shown in the scatter plots (2020, 2021, 2022)
colours <- c("#872341","#ED5AB3","#001B79")

# Scatter plot of happiness score against one factor for the years from 2020
# onwards, coloured by year, with one linear fit per year.
#
# factor_col: name (string) of the column in `happiness_clean` for the x axis
# axis_label: label for the x axis
# palette:    colours for the years
#
# Returns the ggplot object; the legend title is set to "Year".
plot_score_vs_factor <- function(factor_col, axis_label, palette = colours) {
  happiness_clean %>%
    filter(Year >= 2020) %>%
    ggplot(aes(
      x = .data[[factor_col]],
      y = Score,
      group = Year,
      colour = as.factor(Year)
    )) +
    geom_point() +
    stat_smooth(method = "lm", se = FALSE) +
    scale_colour_manual(values = palette) +
    labs(
      y = "Happiness Score",
      x = axis_label
    ) +
    # Legend title "Year" instead of "as.factor(Year)"
    guides(color = guide_legend(title = "Year"))
}

pGDP <- plot_score_vs_factor("GDP_per_cap", "GDP per capita")
pSocial <- plot_score_vs_factor("SocialSupport", "Social Support")
pLife <- plot_score_vs_factor("Healthy_Life_Exp", "Life Expectancy")
pFree <- plot_score_vs_factor("Freedom", "Freedom")
pCorr <- plot_score_vs_factor("Corruption", "Perception of Corruption")
pGene <- plot_score_vs_factor("Generosity", "Generosity")


title <- "Some Variables Have Greater Effect on World Happiness Rankings"
subtitle <- "With Generosity carrying a noticeably lower impact, GDP per capita,
Healthy Life Expectancy and Social Support are the main drivers of happiness"

# Arrange the six plots in a 2 x 3 grid with a single shared legend
(pGDP + pSocial + pLife) / (pFree + pCorr + pGene) +
  plot_layout(guides = "collect") +
  plot_annotation(
    title = title,
    subtitle = subtitle,
    tag_levels = 'A',
    theme = theme(plot.title = element_text(hjust = 0, size = 14, face = "bold"))
  )

2.4 Social Support focus

# Two-colour palette, one colour per year shown in this plot.
colours2 <- c("#872341", "#ED5AB3")

# Social support against happiness score for 2020 and 2021 only, with a
# straight-line (lm) fit per year. The legend title is set to "Year" because
# it would otherwise read "as.factor(Year)".
pSocial2 <- ggplot(
  data = filter(happiness_clean, Year %in% c(2020, 2021)),
  mapping = aes(
    x = Score,
    y = SocialSupport,
    group = Year,
    colour = as.factor(Year)
  )
) +
  labs(x = "Happiness Score", y = "Social Support") +
  scale_colour_manual(values = colours2) +
  guides(color = guide_legend(title = "Year")) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE)



# Headline and two-line sub-headline for the social support figure.
title <- "Social Support decreased noticeably during Covid"
subtitle <- paste0(
  "Quarantine and sedentary living seem to have affected social bonds",
  "\n",
  "negatively with people feeling more isolated."
)

# Print the plot with the headline attached through a patchwork annotation.
pSocial2 +
  plot_annotation(
    subtitle = subtitle,
    title = title,
    theme = theme(
      plot.title = element_text(face = "bold", size = 14, hjust = 0)
    )
  )

2.5 Healthy Life Expectancy

# Re-level the Country factor by each country's median Healthy_Life_Exp so the
# heatmap rows follow that ordering. No grouping is applied here, despite the
# object's name.
grouped_data <- happiness_clean %>%
  mutate(
    Country = fct_reorder(.f = Country, .x = Healthy_Life_Exp, .fun = median)
  )

# Within each year, keep the 15 rows with the highest Healthy_Life_Exp.
# NOTE(review): the heatmap title and earlier comments speak of a "top 20",
# but the cut-off used is n = 15 per year -- confirm which one is intended.
top_countries <- grouped_data %>%
  group_by(Year) %>%
  slice_max(Healthy_Life_Exp, n = 15)

# Keep every year of data for any country that made the cut in at least one
# year, so the result can hold more than 15 distinct countries.
grouped_data_filtered <- filter(
  grouped_data,
  Country %in% pull(top_countries, Country)
)

# Heatmap of Healthy_Life_Exp: one tile per country (rows) and year (columns),
# filled with the viridis "A" palette.
# NOTE(review): the x aesthetic is Year, while the x-axis label below reads
# "Year-on-year change in Life Expectancy" -- confirm the label is intended.
ggplot(
  data = grouped_data_filtered,
  mapping = aes(x = Year, y = Country, fill = Healthy_Life_Exp)
) +
  geom_tile() +
  scale_fill_viridis_c(option = "A") +

  # Titles, axis label and legend title
  labs(
    x = "Year-on-year change in Life Expectancy",
    fill = "Life Expectancy",
    title = "Impact of Global COVID-19 Pandemic on Life Expectancy: How Life \nExpectancy Drastically Changed in 2021 for Top 20 Happiest Countries",
    subtitle = "Examining the data unveiled a substantial rise in global mortality rates amid the COVID-19 pandemic,<br> leading to a widespread decrease in global life expectancy and signaling an unparalleled impact on our <br>collective health.",
    caption = "Source: World Happiness Index, 2015-2022"
  ) +

  # Base theme first, then the overrides on top of it
  theme_minimal(base_size = 12) +
  theme(
    # Title, subtitle and caption are aligned to the whole plot area
    plot.title.position = "plot",
    plot.caption.position = "plot",

    # Headline, markdown subtitle (renders the <br> breaks) and caption
    plot.title = element_text(
      face = "bold", size = 14, color = "grey10", margin = margin(t = 5)
    ),
    plot.subtitle = element_markdown(
      size = 10, color = "grey30", lineheight = 1,
      margin = margin(t = 5, b = 10)
    ),
    plot.caption = element_text(
      size = 9, color = "grey30", lineheight = 1.2, hjust = 0,
      margin = margin(t = 10)
    ),

    # Smaller axis and legend text
    axis.title.x = element_text(size = 9),
    axis.title.y = element_text(size = 9),
    axis.text.x = element_text(size = 8),
    axis.text.y = element_text(size = 8),
    legend.title = element_text(size = 9)
  )

2.6 Top 20 countries

# 2022 rows only, sorted by Rank ascending; the first 20 rows are kept.
df_temp <- happiness_clean %>%
  filter(Year == 2022) %>%
  arrange(Rank) %>%
  head(n = 20)

# Bubble chart of those 20 countries: GDP per capita (x) against corruption
# (y), bubble size = social support, bubble colour = region, with each
# country's name printed at its point.
ggplot(
  data = df_temp,
  mapping = aes(x = GDP_per_cap, y = Corruption, size = SocialSupport)
) +
  geom_point(mapping = aes(color = Region), alpha = 0.7) +
  geom_text(mapping = aes(label = Country), size = 3, hjust = 0.5, vjust = 1) +

  # Scales: fixed x window, bubble size range, qualitative region palette
  scale_x_continuous(limits = c(1.75, 2.25)) +
  scale_size_continuous(range = c(3, 15)) +
  scale_color_brewer(palette = "Set2") +

  # Titles, axis labels and size-legend title
  labs(
    x = "GDP per capita",
    y = "Corruption",
    size = "SocialSupport",
    title = "Israel and Czech Republic Bridge the Gap to the top 20 with High Social Support, \nAgainst Prior Belief of the Need for High GDP per Capita",
    subtitle = "While Western European countries typically show high GDP per capita and social support, Israel and Czech <br>Republic challenge this trend. Despite a lower GDP per capita, Israel and Czech Republic maintain remarkable <br>social support, challenging the belief that economic wealth alone drives strong social bonds.",
    caption = "Source: World Happiness Index, 2015-2022"
  ) +

  # Base theme first, then the overrides on top of it
  theme_minimal() +
  theme(
    # Title, subtitle and caption are aligned to the whole plot area
    plot.title.position = "plot",
    plot.caption.position = "plot",

    # Headline, markdown subtitle (renders the <br> breaks) and caption
    plot.title = element_text(
      face = "bold", size = 14, color = "grey10", margin = margin(t = 5)
    ),
    plot.subtitle = element_markdown(
      size = 9.5, color = "grey30", lineheight = 1,
      margin = margin(t = 5, b = 10)
    ),
    plot.caption = element_text(
      size = 9, color = "grey30", lineheight = 1.2, hjust = 0,
      margin = margin(t = 10)
    ),

    # Smaller axis and legend text
    axis.title.x = element_text(size = 9),
    axis.title.y = element_text(size = 9),
    axis.text.x = element_text(size = 8),
    axis.text.y = element_text(size = 8),
    legend.title = element_text(size = 9)
  )

2.7 Map for 2015

# World map polygons. Two region names are rewritten ("USA" and "UK") to the
# long forms "United States" and "United Kingdom" that the merge on Country
# further down relies on.
world_map <- map_data("world")
world_map$region <- replace(
  world_map$region,
  world_map$region == "USA",
  "United States"
)
world_map$region <- replace(
  world_map$region,
  world_map$region == "UK",
  "United Kingdom"
)

# Attach the happiness columns to every map polygon row (map `region` matched
# to `Country`), then keep only the 2015 rows. Polygon rows without a match
# have Year = NA, so the filter removes them as well.
merged_data <- world_map %>%
  merge(happiness_clean, by.x = "region", by.y = "Country", all.x = TRUE) %>%
  filter(Year == 2015)

# Map regions that ended up without any 2015 row
missing_regions <- unique(
  world_map$region[!(world_map$region %in% merged_data$region)]
)

# Give those regions one placeholder row each, with the dummy Score -1, so
# they are still present in the plot data.
missing_data <- data.frame(
  region = missing_regions,
  Score = -1,
  Year = 2015
)
merged_data <- bind_rows(merged_data, missing_data)

# Fill colour passed to the map's fill scale as its NA value
na_color <- "gray"
# World map for 2015: every region in merged_data is drawn from the world_map
# polygons and filled by its Score; the legend is hidden.
plot1 <- ggplot(
  data = merged_data,
  mapping = aes(map_id = region, fill = Score)
) +
  geom_map(map = world_map, color = "black", linewidth = 0.1) +
  # Extend the axes over the full longitude/latitude range of the map
  expand_limits(x = world_map$long, y = world_map$lat) +
  # Fill scale runs from 0 to the highest Score in merged_data; the -1
  # placeholder rows lie below the lower limit.
  scale_fill_carto_c(
    name = "region",
    type = "diverging",
    palette = "Earth",
    direction = -1,
    na.value = na_color,
    limits = c(0, max(merged_data$Score, na.rm = TRUE))
  ) +
  labs(
    title = "Map of World with Happiness Index Score of 2015",
    subtitle = "This visualization represents the global happiness index scores for the year 2015. <br>The intensity of color corresponds to the happiness score, <br>with darker shades indicating higher scores. Notably, the top rankings are mostly <br>Western European countries, while countries in Sub-Saharan Africa often show lower index scores.",
    caption = "Source: World Happiness Index, 2015-2022",
    color = "Region"
  ) +
  # Blank base theme first, then the text styling and outer margin
  theme_void() +
  theme(
    legend.position = "none",
    # Title, subtitle and caption are aligned to the whole plot area
    plot.title.position = "plot",
    plot.caption.position = "plot",
    plot.title = element_text(
      face = "bold", size = 16, color = "grey10", margin = margin(t = 5)
    ),
    # Markdown subtitle so the <br> breaks are rendered
    plot.subtitle = element_markdown(
      size = 11, color = "grey30", lineheight = 1,
      margin = margin(t = 5, b = 10)
    ),
    plot.caption = element_text(
      size = 9, color = "grey30", lineheight = 1.2, hjust = 0,
      margin = margin(t = 10)
    ),
    plot.margin = margin(10, 40, 10, 40)
  )

plot1

2.8 Map for 2022

# Attach the happiness columns to every map polygon row (map `region` matched
# to `Country`), then keep only the 2022 rows. Polygon rows without a match
# have Year = NA, so the filter removes them as well.
merged_data_2022 <- world_map %>%
  merge(happiness_clean, by.x = "region", by.y = "Country", all.x = TRUE) %>%
  filter(Year == 2022)

# Map regions that ended up without any 2022 row
missing_regions_2022 <- unique(
  world_map$region[!(world_map$region %in% merged_data_2022$region)]
)

# Give those regions one placeholder row each, with the dummy Score -1, so
# they are still present in the plot data.
missing_data_2022 <- data.frame(
  region = missing_regions_2022,
  Score = -1,
  Year = 2022
)
merged_data_2022 <- bind_rows(merged_data_2022, missing_data_2022)

# Fill colour passed to the map's fill scale as its NA value
na_color <- "gray"

# World map for 2022: every region in merged_data_2022 is drawn from the
# world_map polygons and filled by its Score; the legend is hidden.
plot2 <- ggplot(merged_data_2022, aes(map_id = region, fill = Score)) +
  geom_map(map = world_map, color = "black", linewidth = 0.1) +
  # Extend the axes over the full longitude/latitude range of the map
  expand_limits(x = world_map$long, y = world_map$lat) +
  theme_void() +
  # The upper limit is taken from the 2022 data itself. It was previously
  # computed from the 2015 frame (merged_data), so any 2022 score above the
  # 2015 maximum fell outside the scale limits and would not get a palette
  # colour. The -1 placeholder rows still lie below the lower limit of 0.
  scale_fill_carto_c(
    name = "region",
    type = "diverging",
    palette = "Earth",
    direction = -1,
    na.value = na_color,
    limits = c(0, max(merged_data_2022$Score, na.rm = TRUE))
  ) +
  labs(
    title = "Map of World with Happiness Index Score of 2022",
    subtitle = "This visualization represents the global happiness index scores for the year 2022. <br>The intensity of color corresponds to the happiness score, <br>with darker shades indicating higher scores. Notably, in 2022, the top rankings are all <br>Western European countries, while countries in Sub-Saharan Africa remain to show relatively <br>lower index scores, but the country with the lowest score (Afghanistan) belongs to South Asia, <br>which may be due to the influence of the Taliban.",
    caption = "Source: World Happiness Index, 2015-2022",
    color = "Region"
  ) +
  theme(
    plot.title = element_text(
      color = "grey10",
      size = 16,
      face = "bold",
      margin = margin(t = 5)
    ),
    # Markdown subtitle so the <br> breaks are rendered
    plot.subtitle = element_markdown(
      color = "grey30",
      size = 11,
      lineheight = 1,
      margin = margin(t = 5, b = 10)
    ),
    # Title, subtitle and caption are aligned to the whole plot area
    plot.title.position = "plot",
    legend.position = "none",
    plot.caption.position = "plot",
    plot.caption = element_text(
      color = "grey30",
      size = 9,
      lineheight = 1.2,
      hjust = 0,
      margin = margin(t = 10)
    ),
    # Outer margin around the map
    plot.margin = margin(10, 40, 10, 40)
  )

plot2